# Kernel: hconv_bprop_C128_N64

# Copyright 2014 Nervana Systems Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#    http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

// Named constants used by the rest of the file.
//   addr_zero / addr_lut : shared-memory byte offsets placed after two buffers
//                          of (128*8 + 64*8) 4-byte words each (the same tile
//                          size that swapBuf uses); addr_lut is 4 words past
//                          addr_zero.
//   gridDimF / gridDimE  : grid dimensions read from the constant bank.
//   param_*              : kernel parameters, each bound to its c[0x0][...] slot.
<CONSTANT_MAPPING>
    addr_zero  : 4x<128*8*2 + 64*8*2 + 0>
    addr_lut   : 4x<128*8*2 + 64*8*2 + 4>
    gridDimF   : c[0x0][0x14]
    gridDimE   : c[0x0][0x18]

    param_Rand[0]      : c[0x0][0x140]
    param_Rand[1]      : c[0x0][0x144]
    param_I[0]         : c[0x0][0x148]
    param_I[1]         : c[0x0][0x14c]
    param_F[0]         : c[0x0][0x150]
    param_F[1]         : c[0x0][0x154]
    param_E[0]         : c[0x0][0x158]
    param_E[1]         : c[0x0][0x15c]
    param_alpha        : c[0x0][0x160]
    param_flags        : c[0x0][0x164]
    param_N            : c[0x0][0x168]
    param_K            : c[0x0][0x16c]
    param_D            : c[0x0][0x170]
    param_H            : c[0x0][0x174]
    param_W            : c[0x0][0x178]
    param_WN           : c[0x0][0x17c]
    param_HWN          : c[0x0][0x180]
    param_DHWN         : c[0x0][0x184]
    param_C            : c[0x0][0x188]
    param_CRST         : c[0x0][0x18c]
    param_RST          : c[0x0][0x190]
    param_magic_RST    : c[0x0][0x194]
    param_shift_RST    : c[0x0][0x198]
    param_RS           : c[0x0][0x19c]
    param_magic_RS     : c[0x0][0x1a0]
    param_shift_RS     : c[0x0][0x1a4]
    param_S            : c[0x0][0x1a8]
    param_magic_S      : c[0x0][0x1ac]
    param_shift_S      : c[0x0][0x1b0]
    param_pad_d        : c[0x0][0x1b4]
    param_pad_h        : c[0x0][0x1b8]
    param_pad_w        : c[0x0][0x1bc]
    param_str_d        : c[0x0][0x1c0]
    param_str_h        : c[0x0][0x1c4]
    param_str_w        : c[0x0][0x1c8]
    param_P            : c[0x0][0x1cc]
    param_Q            : c[0x0][0x1d0]
    param_PQ           : c[0x0][0x1d4]
    param_QN           : c[0x0][0x1d8]
    param_PQN          : c[0x0][0x1dc]
    param_MPQN         : c[0x0][0x1e0]
    param_magic_Q      : c[0x0][0x1e4]
    param_shift_Q      : c[0x0][0x1e8]
    param_magic_PQ     : c[0x0][0x1ec]
    param_shift_PQ     : c[0x0][0x1f0]
    param_grid_P       : c[0x0][0x1f4]
    param_grid_Q       : c[0x0][0x1f8]
    param_grid_PQ      : c[0x0][0x1fc]
</CONSTANT_MAPPING>

// Register names for the whole kernel.
// Lines using ':' bind names to the listed register numbers in order; lines
// using '~' give a pool of registers for the listed names (presumably the
// assembler picks the individual registers - confirm against the maxas docs).
// Several ranges are declared more than once on purpose: the same physical
// registers carry different names in different phases (setup scalars vs. the
// 64 cx/czero accumulators in 0-63, main-loop operands vs. the output-phase
// names in 64-127), so a name is only meaningful in the phase that uses it.
<REGISTER_MAPPING>

    0-63    ~ blockMPQ, N, n, k, tid1, te, tracke, tf, magic_PQ, magic_Q, PQ, Q, QN, PQN, MPQN, HWN, WN, m, p, q, str_d, str_h, str_w, pad_d, pad_h, pad_w, crst, c, t, r, s, mt, pr, qs, x, y, z, i, x0, xW, y0, yH, z0, zD, lutStore, rst, rs, warp_cnt, RS, S, magic_S, magic_RS

    0-63    : czero<00-63>

     3, 2,11,10,19,18,27,26 : cx<0-7>y0
     7, 6,15,14,23,22,31,30 : cx<0-7>y1
     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
     5, 4,13,12,21,20,29,28 : cx<0-7>y3
    35,34,43,42,51,50,59,58 : cx<0-7>y4
    39,38,47,46,55,54,63,62 : cx<0-7>y5
    33,32,41,40,49,48,57,56 : cx<0-7>y6
    37,36,45,44,53,52,61,60 : cx<0-7>y7

    64-65   : Rand<0-1>
    66-95   ~ dimF, dimE, dimF_64, MPQ_dimE, xmad_tbid

    64-79   : j0Ex<0-7>, j0Fy<0-7>
    80-95   : j1Ex<0-7>, j1Fy<0-7>

    96-111  : loadE<0-3>,  loadF<0-7>,  loadFF<0-3>
    112-115 : trackE<0-1>, trackF<0-1>

    116-120 ~ writeEs, writeFs, K, MPQN8, swapBuf
    121-127 ~ readEs, readFs, tid, blockE, blockF, tbid, seed

    // full-lower
    116-127 : track00I<0-1>, track04I<0-1>, track08I<0-1>, track12I<0-1>, cs<0-3>

     // full
     64-65  : trackI<0-1>
     66-79  ~ crst<00|04|08|12>, writeCs, readCs, magic_RST, RST, DHWN1, lfsr<0-2>, alpha, seed_idx

     // rand
     80-115 ~ c<0-7>, exp<0-7>, rand<0-7>, lfsr<0-2>_1, lfsr<0-2>_2, rand

     // offset
     80-115 ~ c<00|04|08|12>, lut<00|04|08|12>, chan<00|04|08|12>, img<00|04|08|12>

     // temp
     80-115 ~ nn, tid31, tid31_1, tid96, clk_shf1, clk_shf2

</REGISTER_MAPPING>

// Kernel entry. Reads the thread and block ids, then derives:
//   - the per-thread seed address and (optionally) the seed,
//   - the (m, p, q) output coordinates of this block,
//   - global tracking pointers trackE / trackF and their load predicates,
//   - shared-memory write/read offsets for the E and F tiles.
// Control prefix fields (maxas): wait mask : read barrier : write barrier : yield : stall.
// Each S2R below sets write barrier 1..4; consumers wait with masks 01/02/04/08.
--:-:1:-:1      S2R tid,      SR_TID.X;
--:-:2:-:1      S2R blockF,   SR_CTAID.Y;
--:-:3:-:1      S2R blockMPQ, SR_CTAID.X;
--:-:4:-:1      S2R blockE,   SR_CTAID.Z;

<SCHEDULE_BLOCK>
// P0 = tid >= 32: those threads branch over the lookup-table build below.
01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT;

--:-:-:-:1      MOV K,           param_K;
--:-:-:-:1      MOV N,           param_N;
--:-:-:-:1      MOV Q,           param_Q;
--:-:-:-:1      MOV PQ,          param_PQ;
--:-:-:-:1      MOV QN,          param_QN;
--:-:-:-:1      MOV PQN,         param_PQN;
--:-:-:-:1      MOV MPQN,        param_MPQN;
--:-:-:-:1      MOV magic_PQ,    param_magic_PQ;
--:-:-:-:1      MOV magic_Q,     param_magic_Q;

// Write 16 bytes of zero at addr_zero; later LDS reads from addr_zero are
// used to zero registers.
--:-:-:-:1      STS.128 [addr_zero], RZ;

// Grab a seed for this thread
// tbid = blockMPQ*dimE*dimF*64 + blockE*dimF*64 + blockF*64 + tid
// P4 = flags != 0: the seed is only fetched from Rand when flags is non-zero;
// otherwise seed keeps the masked tbid value.
--:-:-:-:1      ISETP.NE.AND P4, PT, RZ, param_flags, PT;
--:-:-:-:1      MOV dimF, gridDimF;
--:-:-:-:1      MOV dimE, gridDimE;
03:-:-:-:1      ISCADD tbid, blockF, tid, 6;
--:-:-:-:1      SHL dimF_64, dimF, 6;
08:-:-:-:1      XMAD.LO2 tbid, dimF_64, blockE, tbid;
04:-:-:-:1      XMAD MPQ_dimE, blockMPQ, dimE, RZ;
--:-:-:-:1      XMAD.LO tbid, MPQ_dimE, dimF_64, tbid, xmad_tbid;
--:-:-:-:1      LOP.AND seed, tbid, 1x<2048*32 - 1>;
--:-:-:-:1      LEA      Rand0.CC, seed, param_Rand[0],     0x2;
--:-:-:-:1      LEA.HI.X Rand1,    seed, param_Rand[1], RZ, 0x2;
--:-:-:-:1  @P4 LDG.E.CS seed, [Rand];

// Division by PQ and Q is done with precomputed magic multipliers and shifts.
// m        = blockMPQ / PQ
// blockMPQ = blockMPQ % PQ
04:-:-:-:1      XMAD.LO2 m, magic_PQ, blockMPQ, RZ;
--:-:-:-:1      SHR.U32 m, m, param_shift_PQ;
--:-:-:-:1      VMAD.U16.U16 blockMPQ, -m, PQ, blockMPQ;
// p = blockMPQ / Q
// q = blockMPQ % Q
--:-:-:-:1      XMAD.LO2 p, magic_Q, blockMPQ, RZ;
--:-:-:-:1      SHR.U32 p, p, param_shift_Q;
--:-:-:-:1      VMAD.U16.U16 q, -p, Q, blockMPQ;

// n = (tid & 15) << 2
// k = (tid >> 4) & 7
--:-:-:-:1      LOP.AND n, tid,  15;
--:-:-:-:1      SHL     n, n, 2;
--:-:-:-:1      BFE.U32 k, tid,  0x304; // 3 bits at position 4

// tracke = blockE*64 + n
// trackE = E + (k*MPQN + m*PQN + p*QN + q*N + tracke) * 2   (byte address)
08:-:-:-:1      ISCADD tracke, blockE, n, 6;
--:-:-:-:1      XMAD         te, N,    q, tracke;
--:-:-:-:1      XMAD.LO2     te, QN,   p, te;
--:-:-:-:1      XMAD.LO2     te, PQN,  m, te;
--:-:-:-:1      XMAD.LO2     te, MPQN, k, te;
--:-:-:-:1      LEA      trackE0.CC, te, param_E[0], 0x1;
--:-:-:-:1      LEA.HI.X trackE1,    te, param_E[1], RZ, 0x1;

// MPQN8 = MPQN * 16: the byte amount added to trackE after each E load
// (presumably 8 rows of MPQN 2-byte elements).
--:-:-:-:1      SHL MPQN8, MPQN, 4;

// crst = blockF*128 + tid
// trackF = F + crst * K * 2   (byte address)
02:-:-:-:1      ISCADD  crst, blockF, tid, 7;
--:-:-:-:1      VMAD.U16.U16 tf, crst, K, RZ;
--:-:-:-:1      LEA      trackF0.CC, tf, param_F[0], 0x1;
--:-:-:-:0      LEA.HI.X trackF1,    tf, param_F[1], RZ, 0x1;

// doLoadF (P1) = crst < CRST
// doLoadE (P3) = tracke < N   (tracke = blockE*64 + n)
--:-:-:-:1      ISETP.LT.AND P1, PT, crst, param_CRST, PT;
--:-:-:-:1      ISETP.LT.AND P3, PT, tracke, N, PT;


// writeEs = (64*k + n) * 4 + 4x<128*8>
--:-:-:-:1      ISCADD  writeEs, k, n, 6;
--:-:-:-:1      ISCADD  writeEs, writeEs, 4x<128*8>, 2;

// writeFs = tid * 4
--:-:-:-:1      SHL writeFs, tid, 2;

// readEs = (((tid >> 1) & 7) << 4) + 4x<128*8>
--:-:-:-:1      BFE.U32 readEs, tid,    0x301; // 3 bits at position 1
--:-:-:-:1      ISCADD  readEs, readEs, 4x<128*8>, 4;
// readFs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
--:-:-:-:1      LOP.AND tid1,   tid,    1;
--:-:-:-:1      LOP.AND readFs, tid,    0x70;
--:-:-:-:1      SHR.U32 readFs, readFs, 3;
--:-:-:-:1      LOP.OR  readFs, readFs, tid1;
--:-:-:-:1      SHL     readFs, readFs, 4;

// swapBuf = byte size of one (E + F) tile buffer; added to / subtracted from
// the shared offsets to alternate between the two buffers.
--:-:-:-:1      MOV32I swapBuf, 4x<64*8 + 128*8>;

// P6 starts true; the main loop toggles it every iteration.
--:-:-:-:1      PSETP.AND.AND P6, PT, PT, PT, PT;


</SCHEDULE_BLOCK>

// Threads with tid >= 32 (P0) skip the lookup-table build; only the remaining
// 32 threads run the code up to END_SETUP.
--:-:-:-:5  @P0 BRA.U END_SETUP;

// Load the filter/stride/padding parameters needed by LUT_LOOP and compute the
// input-space origin (mt, pr, qs) of this block's (m, p, q) output position.
<SCHEDULE_BLOCK>
--:-:-:-:1      MOV warp_cnt,    32;
--:-:-:-:1      MOV rst,         tid;
--:-:-:-:1      MOV RS,          param_RS;
--:-:-:-:1      MOV magic_RS,    param_magic_RS;
--:-:-:-:1      MOV S,           param_S;
--:-:-:-:1      MOV magic_S,     param_magic_S;
--:-:-:-:1      MOV pad_d,       param_pad_d;
--:-:-:-:1      MOV pad_h,       param_pad_h;
--:-:-:-:1      MOV pad_w,       param_pad_w;
--:-:-:-:1      MOV str_d,       param_str_d;
--:-:-:-:1      MOV str_h,       param_str_h;
--:-:-:-:1      MOV str_w,       param_str_w;
--:-:-:-:1      MOV WN,          param_WN;
--:-:-:-:1      MOV HWN,         param_HWN;
// mt = m * str_d - pad_d
// pr = p * str_h - pad_h
// qs = q * str_w - pad_w
--:-:-:-:1      VMAD.U16.U16 mt, m, str_d, -pad_d;
--:-:-:-:1      VMAD.U16.U16 pr, p, str_h, -pad_h;
--:-:-:-:1      VMAD.U16.U16 qs, q, str_w, -pad_w;
</SCHEDULE_BLOCK>

// Build the shared lookup table: for each filter position rst (starting at tid
// and advancing by 32 per pass) store the byte offset of the matching image
// element, or a negative value when the position is out of bounds, at
// addr_lut + rst*4.
LUT_LOOP:

<SCHEDULE_BLOCK>
// warp synchronous loop while warp_cnt < RST (c=0)
--:-:-:-:1      ISETP.LT.AND P0, PT, warp_cnt, param_RST, PT;
--:-:-:-:1      IADD warp_cnt, warp_cnt, 32;
// t =  rst / RS
// rs = rst % RS
--:-:-:-:1      XMAD.LO2 t, magic_RS, rst, RZ;
--:-:-:-:1      SHR.U32 t, t, param_shift_RS;
--:-:-:-:1      VMAD.U16.U16 rs, -t, RS, rst;
// r = rs / S
// s = rs % S
--:-:-:-:1      XMAD.LO2 r, magic_S, rs, RZ;
--:-:-:-:1      SHR.U32 r, r, param_shift_S;
--:-:-:-:1      VMAD.U16.U16 s, -r, S, rs;
// x = qs + s
// y = pr + r
// z = mt + t
--:-:-:-:1      IADD x, qs, s;
--:-:-:-:1      IADD y, pr, r;
--:-:-:-:1      IADD z, mt, t;
// i = (z*HWN + y*WN + x*N) * 2
// Wait mask 20 = barrier 6, the read barrier of the STS at the end of the
// previous pass: i must not be overwritten until that store has read it.
20:-:-:-:1      XMAD     i, N,   x, RZ;
--:-:-:-:1      XMAD.LO2 i, WN,  y, i;
--:-:-:-:1      XMAD.LO2 i, HWN, z, i;
--:-:-:-:1      SHL i, i, 1;
// Bounds check x, y and z, and make i negative if outside.
// LOP3.LUT with 0xfe is a three-input OR, so any set flag is merged into i.
--:-:-:-:1      ISET.LT.AND x0, x, RZ, PT;
--:-:-:-:1      ISET.GE.AND xW, x,  param_W, PT;
--:-:-:-:1      ISET.LT.AND y0, y, RZ, PT;
--:-:-:-:1      ISET.GE.AND yH, y,  param_H, PT;
--:-:-:-:1      ISET.LT.AND z0, z, RZ, PT;
--:-:-:-:1      ISET.GE.AND zD, z,  param_D, PT;
--:-:-:-:1      LOP3.LUT i, i, x0, xW, 0xfe;
// lutStore must be derived from rst before rst is advanced to the next pass.
<ORDERED>
--:-:-:-:1      LOP3.LUT i, i, y0, yH, 0xfe;
--:-:-:-:1      SHL lutStore, rst, 2;
--:-:-:-:1      IADD rst, rst, 32;
</ORDERED>
--:-:-:-:1      LOP3.LUT i, i, z0, zD, 0xfe;
// Store i imgOffset into the shared lookup table
--:6:-:-:1      STS [lutStore + addr_lut], i;
</SCHEDULE_BLOCK>

--:-:-:-:5  @P0 BRA.U LUT_LOOP;

END_SETUP:

<CODE>
    # Clear the 64 accumulator registers: sixteen 128-bit shared loads from
    # addr_zero, one per group of four (czero00, czero04, ... czero60).
    my $zeroLoads = '';
    foreach my $quad (0 .. 15)
    {
        $zeroLoads .= sprintf "--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", 4 * $quad;
    }
    return $zeroLoads;
</CODE>

//<CODE>
//    return join '', map sprintf("--:-:-:-:1      MOV czero%02d, RZ;\n", $_ ), 32..63;
//</CODE>

// Prologue: fetch the first E and F tiles, convert them from fp16 to fp32,
// store them to shared memory, and preload the first set of FFMA operands.

// E: 64 bits (4 fp16) from global when P3, otherwise zeros from addr_zero.
--:-:1:-:1  @P3 LDG.E.CI.64 loadE0, [trackE];
--:-:2:-:1 @!P3 LDS.U.64 loadE0, [addr_zero];

// F: two 128-bit loads (16 fp16 in total) into loadF0-3 and loadFF0-3 when P1,
// otherwise zeros. The main loop consumes loadF0-3 and loadFF0-3 on
// alternating iterations.
--:-:5:-:1  @P1 LDG.E.CI.128 loadF0, [trackF + 2x<0>];
--:4:6:-:1  @P1 LDG.E.CI.128 loadFF, [trackF + 2x<8>];
--:-:3:-:1 @!P1 LDS.U.128 loadF0, [addr_zero];
--:-:-:-:1 @!P1 LDS.U.128 loadFF, [addr_zero];

// Convert E in place, highest destination first so that each packed source
// half is read before its register is overwritten.
03:-:-:-:4      F2F.F32.F16 loadE3, loadE1.H1;
--:-:-:-:0      IADD trackE0.CC, trackE0, MPQN8;
--:-:-:-:4      F2F.F32.F16 loadE2, loadE1.H0;
--:-:-:-:4      F2F.F32.F16 loadE1, loadE0.H1;
--:-:1:-:2      F2F.F32.F16 loadE0, loadE0.H0;

--:-:-:-:0      IADD.X trackE1,  trackE1, RZ;

01:-:-:-:1      STS.128 [writeEs], loadE0;

// Convert the first eight F values (loadF0-3) the same way. Wait mask 14 covers
// barriers 3 and 5, i.e. whichever of the two loadF0 loads was issued.
14:-:-:-:4      F2F.F32.F16 loadF7, loadF3.H1;
--:-:-:-:4      F2F.F32.F16 loadF6, loadF3.H0;
--:-:-:-:4      F2F.F32.F16 loadF5, loadF2.H1;
--:-:1:-:4      F2F.F32.F16 loadF4, loadF2.H0;

// Wait mask 08 = read barrier 4 of the loadFF LDG: trackF may only be advanced
// (by 16 fp16 elements) once that load has consumed its address.
08:-:-:-:0      IADD trackF0.CC, trackF0, 2x<16>;

--:-:-:-:4      F2F.F32.F16 loadF3, loadF1.H1;
--:-:-:-:4      F2F.F32.F16 loadF2, loadF1.H0;
--:-:-:-:4      F2F.F32.F16 loadF1, loadF0.H1;
--:-:2:-:1      F2F.F32.F16 loadF0, loadF0.H0;

--:-:-:-:0      IADD.X trackF1,  trackF1, RZ;

// Each of the eight F values goes to its own row, 128 words apart.
01:-:-:-:1      STS [writeFs + 4x<4*128>], loadF4;
--:-:-:-:1      STS [writeFs + 4x<5*128>], loadF5;
--:-:-:-:1      STS [writeFs + 4x<6*128>], loadF6;
--:-:-:-:1      STS [writeFs + 4x<7*128>], loadF7;

02:-:-:-:1      STS [writeFs + 4x<0*128>], loadF0;
--:-:-:-:1      STS [writeFs + 4x<1*128>], loadF1;
--:-:-:-:1      STS [writeFs + 4x<2*128>], loadF2;
--:-:-:-:1      STS [writeFs + 4x<3*128>], loadF3;

// After the barrier, point the write offsets at the other buffer and negate
// swapBuf so the next swap goes back.
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      IADD writeEs, writeEs, swapBuf;
--:-:-:-:1      IADD writeFs, writeFs, swapBuf;
--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;

// Preload the j0 operand set for the first unrolled step of the main loop.
--:-:1:-:1      LDS.U.128 j0Ex0, [readEs + 4x<0*64  + 00>];
--:-:1:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*128 + 00>];
--:-:1:-:1      LDS.U.128 j0Ex4, [readEs + 4x<0*64  + 32>];
--:-:1:-:1      LDS.U.128 j0Fy4, [readFs + 4x<0*128 + 64>];

// Main loop. Each pass runs 8 unrolled steps (j) of 64 FFMAs over the 8x8
// accumulator tile, while the next E/F tiles are fetched, converted and stored
// to the other shared buffer. The Perl below generates the instruction text.
NEXT_8K:

// P4 = (K >= 16) && P3: fetch another E tile this pass.
--:-:-:-:1      ISETP.GE.AND P4, PT, K, 16, P3;
<CODE>
    # %insert maps a slot name "j<step>c<index>" to instruction text that is
    # emitted right after FFMA number <index> of unrolled step <step>.
    # Predicates used by the inserted code:
    #   P6 : toggled every pass; selects loadF0-3 (P6) or loadFF0-3 (!P6) as
    #        the source of this pass's F values
    #   P0 : K >= 16 before the decrement, i.e. another pass follows
    #   P2 : P0 && P1 && P6, fetch the next 16 F values from global memory
    my %insert =
    (
        j0c1  => "--:-:-:-:1      PSETP.AND.AND P6, PT, !P6, PT, PT;\n",
        j0c3  => "--:-:-:-:1      ISETP.GE.AND P0, PT, K, 16, PT;\n",
        j0c7  => "--:-:-:-:1      IADD K, K, -8;\n",
        j0c15 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P1, P6;\n",
        
        j0c9  => "--:-:2:-:1  \@P4 LDG.E.CI.64  loadE0, [trackE];\n",

        j0c29 => "--:-:5:-:1  \@P2 LDG.E.CI.128 loadF0, [trackF + 2x<0>];\n",
        j0c31 => "20:4:6:-:1  \@P2 LDG.E.CI.128 loadFF, [trackF + 2x<8>];\n",


        # !P6 pass: convert the second half of the previous F fetch (loadFF0-3).
        j4c18 => "--:-:-:-:1 \@!P6 F2F.F32.F16 loadF7, loadFF3.H1;\n",
        j4c22 => "--:-:-:-:1 \@!P6 F2F.F32.F16 loadF6, loadFF3.H0;\n",
        j4c26 => "--:-:-:-:1 \@!P6 F2F.F32.F16 loadF5, loadFF2.H1;\n",
        j4c30 => "--:-:-:-:1 \@!P6 F2F.F32.F16 loadF4, loadFF2.H0;\n",
        j4c34 => "--:-:-:-:1 \@!P6 F2F.F32.F16 loadF3, loadFF1.H1;\n",
        j4c38 => "--:-:-:-:1 \@!P6 F2F.F32.F16 loadF2, loadFF1.H0;\n",
        j4c42 => "--:-:-:-:1 \@!P6 F2F.F32.F16 loadF1, loadFF0.H1;\n",
        j4c46 => "--:-:-:-:1 \@!P6 F2F.F32.F16 loadF0, loadFF0.H0;\n",

        # Convert the freshly loaded E values in place (waits on barrier 2).
        j5c5  => "02:-:-:-:1  \@P4 F2F.F32.F16 loadE3, loadE1.H1;\n",
        j5c9  => "--:-:-:-:1  \@P4 F2F.F32.F16 loadE2, loadE1.H0;\n",
        j5c13 => "--:-:-:-:1  \@P4 F2F.F32.F16 loadE1, loadE0.H1;\n",
        j5c17 => "--:-:2:-:1  \@P4 F2F.F32.F16 loadE0, loadE0.H0;\n",

        j5c33 => "02:-:-:-:1  \@P0 STS.128 [writeEs], loadE0;\n",

        j5c42 => "--:-:-:-:1  \@P4 IADD   trackE0.CC, trackE0, MPQN8;\n",
        j5c47 => "--:-:-:-:1  \@P4 IADD.X trackE1,    trackE1, RZ;\n",

        # P6 pass: convert the first half of the new F fetch (loadF0-3) in place.
        j5c57 => "10:-:-:-:1  \@P6 F2F.F32.F16 loadF7, loadF3.H1;\n",
        j5c61 => "--:-:-:-:1  \@P6 F2F.F32.F16 loadF6, loadF3.H0;\n",
        j6c1  => "--:-:-:-:1  \@P6 F2F.F32.F16 loadF5, loadF2.H1;\n",
        j6c5  => "--:-:-:-:1  \@P6 F2F.F32.F16 loadF4, loadF2.H0;\n",
        j6c9  => "--:-:2:-:1  \@P6 F2F.F32.F16 loadF3, loadF1.H1;\n",
        j6c13 => "--:-:3:-:1  \@P6 F2F.F32.F16 loadF2, loadF1.H0;\n",
        j6c17 => "08:-:4:-:1  \@P6 F2F.F32.F16 loadF1, loadF0.H1;\n",
        j6c21 => "--:-:5:-:1  \@P6 F2F.F32.F16 loadF0, loadF0.H0;\n",


        # Store the eight converted F values, one per 128-word row.
        j6c25 => "02:-:-:-:1  \@P0 STS [writeFs + 4x<7*128>], loadF7;\n",
        j6c27 => "--:-:-:-:1  \@P0 STS [writeFs + 4x<6*128>], loadF6;\n",
        j6c29 => "--:-:-:-:1  \@P0 STS [writeFs + 4x<5*128>], loadF5;\n",
        j6c31 => "--:-:-:-:1  \@P0 STS [writeFs + 4x<4*128>], loadF4;\n",
        j6c33 => "--:-:-:-:1  \@P0 STS [writeFs + 4x<3*128>], loadF3;\n",
        j6c35 => "04:-:-:-:1  \@P0 STS [writeFs + 4x<2*128>], loadF2;\n",
        j6c37 => "08:-:-:-:1  \@P0 STS [writeFs + 4x<1*128>], loadF1;\n",
        j6c39 => "10:-:-:-:1  \@P0 STS [writeFs + 4x<0*128>], loadF0;\n",

        j7c57 => "--:-:-:-:1  \@P2 IADD   trackF0.CC, trackF0, 2x<16>;\n",
        j7c62 => "--:-:-:-:1  \@P2 IADD.X trackF1,    trackF1, RZ;\n",

        # End of step 6: synchronise, then flip the read and write offsets to
        # the opposite buffers and negate swapBuf for the next flip.
        j6c63 => "--:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
                 "--:-:-:-:1  \@P0 IADD readEs,  readEs, -swapBuf;\n" .
                 "--:-:-:-:1  \@P0 IADD readFs,  readFs, -swapBuf;\n" .
                 "--:-:-:-:1  \@P0 IADD writeEs, writeEs, swapBuf;\n" .
                 "--:-:-:-:1  \@P0 IADD writeFs, writeFs, swapBuf;\n" .
                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",




        j7c63 => "--:-:-:Y:5  \@P0 BRA.U NEXT_8K;\n",
    );

    # @cOrder lists the 64 (x, y) accumulator coordinates in the order the
    # FFMAs are issued: 2x2 groups visited in the @swirl pattern, with the y
    # sequence reversed for every other x pair.
    my @cOrder;
    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
    my @y = (0,1,4,5);
    foreach my $x (0,2,4,6)
    {
        foreach my $y (@y)
        {
            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
        }
        @y = reverse @y;
    }

    my $out;
    foreach my $j (0 .. 7)
    {
        # Steps alternate between the j0 and j1 operand register sets; while
        # one set feeds the FFMAs, the other is loaded for the next step.
        my $odd      = $j & 1;
        my $nOdd     = !$odd + 0;
        my $rsOffset = ($j + 1) % 8;
        # The loads issued during step 7 fetch row 0 of the next tile, so they
        # only run when another pass follows (P0).
        my $rsPred   = $j == 7 ? '@P0' : '   ';

        $insert{"j${j}c0"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dEx0, [readEs + 4x<%d*64  + 00>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c2"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c4"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dEx4, [readEs + 4x<%d*64  + 32>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;

        foreach my $c (0 .. 63)
        {
            my ($x,$y) = @{$cOrder[$c]};

            my $ins    = $insert{"j${j}c$c"} || '';
            
            # An FFMA followed by one of these inserted instructions gets a
            # stall count of 0, otherwise 1.
            my $stall  = $ins =~ /LDS|F2F|I2F|LDG|STS|BAR|BRA/ ? 0 : 1;

            # Set the yield flag once per step, at FFMA 32, unless its stall is 0.
            my $yield  = $c == 32 && $stall ? 'Y' : '-';

            # The first FFMA of each step waits on barrier 1, which the
            # operand LDS instructions set.
            my $wait   = $c == 0 ? '01' : '--';

            my $ctrl   = "$wait:-:-:$yield:$stall";

            $out .= sprintf "%s      FFMA cx%dy%d, j%dEx%d, j%dFy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
        }
    }
    return $out;

</CODE>


// Output-phase setup: switch to the output-phase register names, compute the
// shared-memory offsets used to exchange results between threads, the base
// address into I, the four crst row indices handled per STORE_C call, and
// seed the random generator state.
<SCHEDULE_BLOCK>
--:-:-:-:1      MOV magic_RST, param_magic_RST;
--:-:-:-:1      MOV RST,       param_RST;
// DHWN1 = DHWN * 2
--:-:-:-:1      MOV DHWN1,     param_DHWN;
--:-:-:-:1      SHL DHWN1,     DHWN1, 1;

// Keep only the low byte of the read offsets, i.e. the thread-dependent part
// without the E-tile base and any buffer-swap offset.
--:-:-:-:1      LOP.AND readEs, readEs, 0xff;
--:-:-:-:1      LOP.AND readFs, readFs, 0xff;

// writeCs = ((readFs / 4) * 64 + readEs) / 2;
--:-:-:-:1      ISCADD  writeCs, readFs, readEs, 4;
--:-:-:-:1      SHR.U32 writeCs, writeCs, 1;

// readCs = (tid96 << 4) + (tid31 << 2)
--:-:-:-:1      LOP.AND tid31, tid, 31;
--:-:-:-:1      LOP.AND tid96, tid, 96;
--:-:-:-:1      SHL readCs, tid31, 2;
--:-:-:-:1      ISCADD readCs, tid96, readCs, 4;

// nn = blockE*64 + (tid31 << 1);
--:-:-:-:1      SHL tid31_1, tid31, 1;
--:-:-:-:1      ISCADD nn, blockE, tid31_1, 6;

// P5 = nn < N
--:-:-:-:1      ISETP.LT.AND P5, PT, nn, param_N, PT; 

// crst = blockF*128 + (tid96 >> 1)
--:-:-:-:1      SHR.U32 crst00, tid96, 1;
--:-:-:-:1      ISCADD  crst00, blockF, crst00, 7;
--:-:-:-:1      IADD    crst04, crst00, 4;
--:-:-:-:1      IADD    crst08, crst00, 8;
--:-:-:-:1      IADD    crst12, crst00, 12;

// trackI = I + nn * 2   (byte address)
--:-:-:-:1      LEA      trackI0.CC, nn, param_I[0],     0x1;
--:-:-:-:1      LEA.HI.X trackI1,    nn, param_I[1], RZ, 0x1;

--:-:-:-:1      MOV alpha, param_alpha;

// Seed the Tausworthe
// lfsr0 comes from the seed and tbid; lfsr1 and lfsr2 come from the clock and
// global timer, each rotated by an amount mixed with the lane index (tid31).
--:-:-:-:1      LOP.XOR lfsr0, seed, tbid;
--:-:-:-:1      CS2R lfsr1, SR_CLOCKLO;
--:-:-:-:1      CS2R lfsr2, SR_GLOBALTIMERLO;
--:-:-:-:1      LOP.AND clk_shf1, lfsr1, 31;
--:-:-:-:1      LOP.AND clk_shf2, lfsr2, 31;
--:-:-:-:1      LOP.XOR clk_shf1, clk_shf1, tid31;
--:-:-:-:1      LOP.XOR clk_shf2, clk_shf2, tid31;
--:-:-:-:1      SHF.R.U64 lfsr1, lfsr1, clk_shf1, lfsr1;
--:-:-:-:1      SHF.R.U64 lfsr2, lfsr2, clk_shf2, lfsr2;
// seed_idx = same masked index that was used to read the seed at kernel entry
--:-:-:-:1      LOP.AND seed_idx, tbid, 1x<2048*32 - 1>;

// p6 = flags != 0 (stochastic rounding)
--:-:-:-:1      ISETP.NE.AND P6, PT, RZ, param_flags, PT; 

</SCHEDULE_BLOCK>

--:-:-:-:5      BAR.SYNC 0;

<CODE>

    # Emit the eight output passes, one per accumulator row y. Each pass scales
    # the row's eight accumulators by alpha into c0..c7 and calls STORE_C.
    # Before row 4 the four crst counters are advanced by 60.
    my @passes;
    for my $row (0 .. 7)
    {
        my $text = '';

        if ($row == 4)
        {
            $text .= "--:-:-:-:1      IADD crst$_, crst$_, 60;\n" foreach qw(00 04 08 12);
        }

        foreach my $col (0 .. 7)
        {
            # Only the last FMUL of a row carries a stall count of 0.
            my $stall = $col < 7 ? 1 : 0;
            $text .= "--:-:-:-:$stall      FMUL c$col, cx${col}y$row, alpha;\n";
        }

        push @passes, $text . "--:-:-:-:5      CAL STORE_C;\n\n";
    }
    return join '', @passes;

</CODE>

// Update the seed
// Only when P6 (flags != 0): recompute this thread's Rand address from
// seed_idx and write the last generated rand value back to it.
--:-:-:-:6  @P6 LEA      Rand0.CC, seed_idx, param_Rand[0],     0x2;
--:-:-:-:2  @P6 LEA.HI.X Rand1,    seed_idx, param_Rand[1], RZ, 0x2;
--:-:-:-:1  @P6 STG.E.CS [Rand], rand;

--:-:-:-:5      EXIT;

// STORE_C: subroutine called once per accumulator row with c0..c7 holding the
// alpha-scaled fp32 results. Converts them to fp16 (round-nearest, or
// stochastic rounding when P6), packs them in pairs, exchanges them through
// shared memory, and accumulates them into I with atomic fp16x2 adds at the
// addresses found via the lookup table. Increments crst00/04/08/12 by one.
STORE_C:

--:-:-:-:5  @P6 BRA.U DO_RANDOM;

// Round nearest
--:-:-:-:4      F2F.F16.F32 c0, c0;
--:-:1:-:4      F2F.F16.F32 c1, c1;
--:-:-:-:4      F2F.F16.F32 c2, c2;
--:-:2:-:4      F2F.F16.F32 c3, c3;
--:-:-:-:4      F2F.F16.F32 c4, c4;
--:-:3:-:4      F2F.F16.F32 c5, c5;
--:-:-:-:4      F2F.F16.F32 c6, c6;
--:-:4:-:1      F2F.F16.F32 c7, c7;

--:-:-:-:5      BRA.U END_ROUND;

DO_RANDOM:

// Round Random
--:-:-:-:5      CAL RANDOM_ROUND;

END_ROUND:

// Pack 2 16 bit values into 32 bit words
// Wait masks combine the F2F write barriers 1-4 with barriers 5 and 6, which
// the previous call's LDS/RED instructions set on the cs registers.
11:-:-:-:2      BFI cs0, c1, 0x1010, c0;
02:-:-:-:2      BFI cs1, c3, 0x1010, c2;
24:-:-:-:2      BFI cs2, c5, 0x1010, c4;
08:-:-:-:0      BFI cs3, c7, 0x1010, c6;

// Exchange through shared memory (STS then LDS) to drop the awkward
// readEs/readFs-derived mapping (writeCs) in favour of the readCs mapping.
--:-:-:-:4      STS.64 [writeCs+2x<00>], cs0;
--:-:-:-:1      STS.64 [writeCs+2x<32>], cs2;
--:-:-:-:1      LDS cs0, [readCs + 2x<0*64>];
--:-:-:-:1      LDS cs1, [readCs + 2x<1*64>];
--:-:-:-:1      LDS cs2, [readCs + 2x<2*64>];
--:-:5:-:1      LDS cs3, [readCs + 2x<3*64>];

<SCHEDULE_BLOCK>
// P0..P3 = (crstNN < CRST) && P5, one predicate per output row of this call.
--:-:-:-:1      ISETP.LT.AND P0, PT, crst00, param_CRST, P5;
--:-:-:-:1      ISETP.LT.AND P1, PT, crst04, param_CRST, P5;
--:-:-:-:1      ISETP.LT.AND P2, PT, crst08, param_CRST, P5;
--:-:-:-:1      ISETP.LT.AND P3, PT, crst12, param_CRST, P5;

// c = crst / RST  (magic multiply and shift)
--:-:-:-:1      XMAD.LO2 c00, magic_RST, crst00, RZ;
--:-:-:-:1      XMAD.LO2 c04, magic_RST, crst04, RZ;
--:-:-:-:1      XMAD.LO2 c08, magic_RST, crst08, RZ;
--:-:-:-:1      XMAD.LO2 c12, magic_RST, crst12, RZ;

--:-:-:-:1      SHR.U32 c00, c00, param_shift_RST;
--:-:-:-:1      SHR.U32 c04, c04, param_shift_RST;
--:-:-:-:1      SHR.U32 c08, c08, param_shift_RST;
--:-:-:-:1      SHR.U32 c12, c12, param_shift_RST;

// lut = (crst % RST) * 4 : byte offset of the entry in the shared lookup table
--:-:-:-:1      VMAD.U16.U16 lut00, -c00, RST, crst00;
--:-:-:-:1      VMAD.U16.U16 lut04, -c04, RST, crst04;
--:-:-:-:1      VMAD.U16.U16 lut08, -c08, RST, crst08;
--:-:-:-:1      VMAD.U16.U16 lut12, -c12, RST, crst12;

--:-:-:-:1      SHL lut00, lut00, 2;
--:-:-:-:1      SHL lut04, lut04, 2;
--:-:-:-:1      SHL lut08, lut08, 2;
--:-:-:-:1      SHL lut12, lut12, 2;

// chan = c * DHWN * 2 : byte offset of channel c
--:-:-:-:1      XMAD.LO2 chan00, DHWN1, c00, RZ;
--:-:-:-:1      XMAD.LO2 chan04, DHWN1, c04, RZ;
--:-:-:-:1      XMAD.LO2 chan08, DHWN1, c08, RZ;
--:-:-:-:1      XMAD.LO2 chan12, DHWN1, c12, RZ;

// Advance to the next row for the following call.
--:-:-:-:1      IADD crst00, crst00, 1;
--:-:-:-:1      IADD crst04, crst04, 1;
--:-:-:-:1      IADD crst08, crst08, 1;
--:-:-:-:1      IADD crst12, crst12, 1;

--:-:1:-:1  @P0 LDS img00, [lut00 + addr_lut];
--:-:2:-:1  @P1 LDS img04, [lut04 + addr_lut];
--:-:3:-:1  @P2 LDS img08, [lut08 + addr_lut];
--:-:4:-:1  @P3 LDS img12, [lut12 + addr_lut];

</SCHEDULE_BLOCK>

// Final address = trackI + img + chan. A negative img is the out-of-bounds
// marker written by LUT_LOOP, so it clears the row's predicate.
11:-:-:-:1      IADD3  track00I0.CC, trackI0, img00, chan00;
--:-:-:-:5      ISETP.GE.AND P0, PT, img00, RZ, P0;
--:-:-:-:1      IADD.X track00I1,    trackI1, RZ;

02:-:-:-:1      IADD3  track04I0.CC, trackI0, img04, chan04;
--:-:-:-:5      ISETP.GE.AND P1, PT, img04, RZ, P1;
--:-:-:-:1      IADD.X track04I1,    trackI1, RZ;

04:-:-:-:1      IADD3  track08I0.CC, trackI0, img08, chan08;
--:-:-:-:5      ISETP.GE.AND P2, PT, img08, RZ, P2;
--:-:-:-:1      IADD.X track08I1,    trackI1, RZ;

08:-:-:-:1      IADD3  track12I0.CC, trackI0, img12, chan12;
--:-:-:-:5      ISETP.GE.AND P3, PT, img12, RZ, P3;
--:-:-:-:0      IADD.X track12I1,    trackI1, RZ;


// Atomic packed-fp16 adds into I. Read barriers 5 and 6 protect the cs and
// track registers until the reductions have read them.
--:-:-:-:2  @P0 RED.E.ADD.F16x2.FTZ.RN [track00I], cs0;
--:5:-:-:2  @P1 RED.E.ADD.F16x2.FTZ.RN [track04I], cs1;
--:-:-:-:4  @P2 RED.E.ADD.F16x2.FTZ.RN [track08I], cs2;
--:6:-:-:1  @P3 RED.E.ADD.F16x2.FTZ.RN [track12I], cs3;

--:-:-:-:5      RET;

// RANDOM_ROUND: subroutine used by STORE_C when stochastic rounding is on.
// Advances the three-component Tausworthe generator, derives eight random
// values from it, adds each (scaled below the fp16 lsb of the matching value)
// to c0..c7 and converts the results to fp16 with round-toward-zero.
// Leaves write barriers 1-4 set on the final conversions for the caller.
RANDOM_ROUND:

<SCHEDULE_BLOCK>

// lfsr0 = ((lfsr0 & 0xfffffffe) << 12) ^ (((lfsr0 << 13) ^ lfsr0) >> 19);
--:-:-:-:1      LOP32I.AND lfsr0_1, lfsr0, 0xfffffffe;
--:-:-:-:1      SHL lfsr0_1, lfsr0_1, 12;
--:-:-:-:1      SHL lfsr0_2, lfsr0, 13;
--:-:-:-:1      LOP.XOR lfsr0_2, lfsr0_2, lfsr0;
--:-:-:-:1      SHR.U32 lfsr0_2, lfsr0_2, 19;
--:-:-:-:1      LOP.XOR lfsr0, lfsr0_1, lfsr0_2;

// lfsr1 = ((lfsr1 & 0xfffffff8) <<  4) ^ (((lfsr1 << 2)  ^ lfsr1) >> 25);
--:-:-:-:1      LOP32I.AND lfsr1_1, lfsr1, 0xfffffff8;
--:-:-:-:1      SHL lfsr1_1, lfsr1_1, 4;
--:-:-:-:1      SHL lfsr1_2, lfsr1, 2;
--:-:-:-:1      LOP.XOR lfsr1_2, lfsr1_2, lfsr1;
--:-:-:-:1      SHR.U32 lfsr1_2, lfsr1_2, 25;
--:-:-:-:1      LOP.XOR lfsr1, lfsr1_1, lfsr1_2;

// lfsr2 = ((lfsr2 & 0xfffffff0) << 11) ^ (((lfsr2 << 3)  ^ lfsr2) >> 11);
// NOTE(review): L'Ecuyer's published taus88 uses << 17 for this third
// component; this code shifts by 11. Confirm whether that is intentional
// before changing it, since it alters the generated sequence.
--:-:-:-:1      LOP32I.AND lfsr2_1, lfsr2, 0xfffffff0;
--:-:-:-:1      SHL lfsr2_1, lfsr2_1, 11;
--:-:-:-:1      SHL lfsr2_2, lfsr2, 3;
--:-:-:-:1      LOP.XOR lfsr2_2, lfsr2_2, lfsr2;
--:-:-:-:1      SHR.U32 lfsr2_2, lfsr2_2, 11;
--:-:-:-:1      LOP.XOR lfsr2, lfsr2_1, lfsr2_2;

// rand = lfsr0 ^ lfsr1 ^ lfsr2;
// (LOP3.LUT with 0x96 is a three-input XOR)
--:-:-:-:1      LOP3.LUT  rand, lfsr0, lfsr1, lfsr2, 0x96;

// Strip mantissa and leave sign+exponent
--:-:-:-:1      LOP32I.AND exp0, c0, 0xff800000;
--:-:-:-:1      LOP32I.AND exp1, c1, 0xff800000;
--:-:-:-:1      LOP32I.AND exp2, c2, 0xff800000;
--:-:-:-:1      LOP32I.AND exp3, c3, 0xff800000;
--:-:-:-:1      LOP32I.AND exp4, c4, 0xff800000;
--:-:-:-:1      LOP32I.AND exp5, c5, 0xff800000;
--:-:-:-:1      LOP32I.AND exp6, c6, 0xff800000;
--:-:-:-:1      LOP32I.AND exp7, c7, 0xff800000;

// Find the exponent that will shift 32 bits of integer data 
// out past the lsb of this number as an fp16
// exp *= 2^-10 * 2^-32  (2^-42; 0x2a800000 is 2^-42 as an fp32 bit pattern)
--:-:-:-:1      FMUL32I exp0, exp0, 0x2a800000;
--:-:-:-:1      FMUL32I exp1, exp1, 0x2a800000;
--:-:-:-:1      FMUL32I exp2, exp2, 0x2a800000;
--:-:-:-:1      FMUL32I exp3, exp3, 0x2a800000;
--:-:-:-:1      FMUL32I exp4, exp4, 0x2a800000;
--:-:-:-:1      FMUL32I exp5, exp5, 0x2a800000;
--:-:-:-:1      FMUL32I exp6, exp6, 0x2a800000;
--:-:-:-:1      FMUL32I exp7, exp7, 0x2a800000;

--:-:-:-:1      MOV rand0, rand;
</SCHEDULE_BLOCK>

// generate 7 other rotations of this rand (rand1..rand7, by 4..28 bits;
// rand0 is the unrotated value)
// Convert rand to float
// Scale rand and add to value
// Convert to fp16
// The three stages are hand-interleaved; barriers 1-4 are reused, first for
// the I2F results and then for the F2F results.
--:-:-:-:2      SHF.R.U64 rand1, rand,  4, rand;
--:-:-:-:2      SHF.R.U64 rand2, rand,  8, rand;
--:-:-:-:0      SHF.R.U64 rand3, rand, 12, rand;
--:-:-:-:4      I2F.F32.U32.RZ rand0, rand0;
--:-:-:-:0      SHF.R.U64 rand4, rand, 16, rand;
--:-:1:-:4      I2F.F32.U32.RZ rand1, rand1;
--:-:-:-:0      SHF.R.U64 rand5, rand, 20, rand;
--:-:-:-:4      I2F.F32.U32.RZ rand2, rand2;
--:-:-:-:0      SHF.R.U64 rand6, rand, 24, rand;
--:-:2:-:4      I2F.F32.U32.RZ rand3, rand3;
--:-:-:-:0      SHF.R.U64 rand7, rand, 28, rand;
--:-:-:-:4      I2F.F32.U32.RZ rand4, rand4;
01:-:-:-:0      FFMA.RZ c0, rand0, exp0, c0;
--:-:3:-:4      I2F.F32.U32.RZ rand5, rand5;
--:-:-:-:0      FFMA.RZ c1, rand1, exp1, c1;
--:-:-:-:4      I2F.F32.U32.RZ rand6, rand6;
02:-:-:-:0      FFMA.RZ c2, rand2, exp2, c2;
--:-:4:-:4      I2F.F32.U32.RZ rand7, rand7;
--:-:-:-:0      FFMA.RZ c3, rand3, exp3, c3;
--:-:-:-:4      F2F.F16.F32.RZ c0, c0;
04:-:-:-:0      FFMA.RZ c4, rand4, exp4, c4;
--:-:1:-:4      F2F.F16.F32.RZ c1, c1;
--:-:-:-:0      FFMA.RZ c5, rand5, exp5, c5;
--:-:-:-:4      F2F.F16.F32.RZ c2, c2;
08:-:-:-:0      FFMA.RZ c6, rand6, exp6, c6;
--:-:2:-:4      F2F.F16.F32.RZ c3, c3;
--:-:-:-:0      FFMA.RZ c7, rand7, exp7, c7;
--:-:-:-:4      F2F.F16.F32.RZ c4, c4;
--:-:3:-:4      F2F.F16.F32.RZ c5, c5;
--:-:-:-:4      F2F.F16.F32.RZ c6, c6;
--:-:4:-:1      F2F.F16.F32.RZ c7, c7;

--:-:-:-:5      RET;


// --:-:-:-:2      SHF.R.U64 rand1, rand,  4, rand;
// --:-:-:-:2      SHF.R.U64 rand2, rand,  8, rand;
// --:-:-:-:2      SHF.R.U64 rand3, rand, 12, rand;
// --:-:-:-:2      SHF.R.U64 rand4, rand, 16, rand;
// --:-:-:-:2      SHF.R.U64 rand5, rand, 20, rand;
// --:-:-:-:2      SHF.R.U64 rand6, rand, 24, rand;
// --:-:-:-:0      SHF.R.U64 rand7, rand, 28, rand;
// 
// --:-:-:-:4      I2F.F32.U32.RZ rand0, rand0;
// --:-:1:-:4      I2F.F32.U32.RZ rand1, rand1;
// --:-:-:-:4      I2F.F32.U32.RZ rand2, rand2;
// --:-:2:-:4      I2F.F32.U32.RZ rand3, rand3;
// --:-:-:-:4      I2F.F32.U32.RZ rand4, rand4;
// --:-:3:-:4      I2F.F32.U32.RZ rand5, rand5;
// --:-:-:-:4      I2F.F32.U32.RZ rand6, rand6;
// --:-:4:-:1      I2F.F32.U32.RZ rand7, rand7;
// 
// 01:-:-:-:1      FFMA.RZ c0, rand0, exp0, c0;
// --:-:-:-:1      FFMA.RZ c1, rand1, exp1, c1;
// 02:-:-:-:1      FFMA.RZ c2, rand2, exp2, c2;
// --:-:-:-:1      FFMA.RZ c3, rand3, exp3, c3;
// 04:-:-:-:1      FFMA.RZ c4, rand4, exp4, c4;
// --:-:-:-:1      FFMA.RZ c5, rand5, exp5, c5;
// 08:-:-:-:1      FFMA.RZ c6, rand6, exp6, c6;
// --:-:-:-:0      FFMA.RZ c7, rand7, exp7, c7;
// 
// --:-:-:-:4      F2F.F16.F32.RZ c0, c0;
// --:-:1:-:4      F2F.F16.F32.RZ c1, c1;
// --:-:-:-:4      F2F.F16.F32.RZ c2, c2;
// --:-:2:-:4      F2F.F16.F32.RZ c3, c3;
// --:-:-:-:4      F2F.F16.F32.RZ c4, c4;
// --:-:3:-:4      F2F.F16.F32.RZ c5, c5;
// --:-:-:-:4      F2F.F16.F32.RZ c6, c6;
// --:-:4:-:1      F2F.F16.F32.RZ c7, c7;